from urllib.parse import urljoin

import pymysql
import requests
from bs4 import BeautifulSoup

# MySQL connection used to persist the scraped articles.
# SECURITY NOTE(review): credentials are hard-coded in source and were committed
# to version control — move them to environment variables or an untracked
# config file, and rotate the exposed password.
connect = pymysql.connect(
    user='root',
    password='Ts123456',
    host='221.224.198.138',
    port=3306,
    db='spiders',          # database is selected here; no separate USE needed
    charset='utf8',
)
conn = connect.cursor()

# Create the target table once; IF NOT EXISTS makes repeated runs idempotent.
sql = "CREATE TABLE IF NOT EXISTS spider_bjx(Id Int Unsigned Auto_Increment,main_title text,keyword text,content text,sub_title text,from_url Varchar(255),fbrq Varchar(20),Primary Key(`Id`));"
conn.execute(sql)
connect.commit()
# Listing page of the target section on bjx.com.cn (wind-power equipment
# and materials).
url = 'http://fd.bjx.com.cn/fdsbycl/'

# Spoof a desktop-browser User-Agent so the site serves the normal page
# instead of blocking the default requests UA.
headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
}

# Fetch the listing page. The timeout prevents the script from hanging
# forever on a stalled connection; raise_for_status stops us from silently
# parsing an HTTP error page.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()

# html5lib is the most lenient parser available to BeautifulSoup — it
# tolerates the malformed markup this site serves.
soup = BeautifulSoup(response.text, "html5lib")

# Each <ul class="list_left_ul"> holds one column of article links.
video_list = soup.find_all('ul', {'class': 'list_left_ul'})


# Walk every article link found on the listing page, fetch each article
# page, and insert one row per article into spider_bjx.
for listing in video_list:
    for link in listing.find_all('a'):
        # Resolve possibly-relative hrefs against the listing URL
        # (urljoin returns absolute hrefs unchanged).
        article_url = urljoin(url, link['href'])

        # Fetch and parse the article page; timeout guards against hangs.
        detail_response = requests.get(article_url, headers=headers, timeout=30)
        detail_soup = BeautifulSoup(detail_response.text, "html5lib")

        keyword = detail_soup.find('div', {'class': 'tempa list_key btemp'})
        list_detail = detail_soup.find('div', {'class': 'list_detail'})
        if keyword is None or list_detail is None:
            # Unexpected page layout (e.g. an ad or off-site link) —
            # skip it instead of crashing the whole crawl.
            print('skipping (unexpected page layout):', article_url)
            continue
        print(keyword.get_text())
        print(list_detail.text)

        # The article's sub-title is the concatenation of all <strong> texts.
        sub = ''.join(tag.text for tag in detail_soup.find_all('strong'))

        # Publish date: the <span> sibling of the link on the listing page.
        # It may be absent for some entries — fall back to an empty string.
        date_span = link.find_next_sibling('span')
        fbrq = date_span.get_text() if date_span is not None else ''

        # Parameterized insert — values are escaped by the driver.
        conn.execute(
            "insert into spider_bjx(main_title,sub_title,keyword,content,from_url,fbrq) VALUES (%s,%s,%s,%s,%s,%s)",
            (link.get_text(), sub, keyword.get_text(), list_detail.text,
             link['href'], fbrq),
        )
        connect.commit()


connect.close()